#General
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px
#Feature Engineering
from scipy import stats
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, RobustScaler
#Modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import f_classif, VarianceThreshold, SelectKBest
from yellowbrick.model_selection import RFECV  # yellowbrick's RFECV wraps sklearn's and adds plotting
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
from numpy import mean
from sklearn.ensemble import AdaBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
# Read the data
df = pd.read_csv('data.csv')
columns = df.columns
print(df.shape)
print("total null values", df.isnull().sum().sum())
print("total potential duplicated rows", df.duplicated().sum())
# Data info overall
bankrupt_counts = df["Bankrupt?"].value_counts()
bankrupt_perc = round(df["Bankrupt?"].value_counts(normalize=True), 2)
display(bankrupt_counts, bankrupt_perc)
print("----------------------------------")
df.info()
# Visualize the outcome's distribution with a bar plot
fig = px.bar(x=df['Bankrupt?'].value_counts().index,
             y=df['Bankrupt?'].value_counts(),
             text=(df['Bankrupt?'].value_counts()/len(df['Bankrupt?'])*100),
             height=500, width=600, title='Bankruptcy')
fig.update_traces(textposition='outside', texttemplate='%{text:.4s}%',
marker=dict(color='snow', line=dict(color='black', width=3)))
fig.show()
# Visualize the features' distributions with box plots
plt.figure(figsize = (20,20))
ax = sns.boxplot(data= df, orient="h")
ax.set_title('Features Boxplots', fontsize = 18)
ax.set(xscale="log")
plt.show()
# Visualize the numerical features' distributions with histograms
df.hist(figsize = (35,30), bins = 50)
plt.show()
# Spearman correlation with the target, then a heatmap of the full matrix
df.corr('spearman')["Bankrupt?"].sort_values()
f, ax = plt.subplots(figsize=(30, 25))
mat = df.corr('spearman')
mask = np.triu(np.ones_like(mat, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(mat, mask=mask, cmap=cmap, vmax=1, center=0,# annot = True,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
# Hypothesis testing: compare each feature's distribution across the two classes
bankrupt_df = df[df['Bankrupt?'] == 1]
not_bankrupt_df = df[df['Bankrupt?'] == 0]
cols = df.drop("Bankrupt?", axis=1).columns
for feature in cols:
    a = bankrupt_df[feature]
    b = not_bankrupt_df[feature].sample(n=len(a), random_state=42)  # subsample the majority class to match sizes
    test = stats.ttest_ind(a, b)  # two-sample t-test
    plt.figure()
    sns.histplot(bankrupt_df[feature], kde=True, stat="density", label="Bankrupt")
    sns.histplot(not_bankrupt_df[feature], kde=True, stat="density", label="Not Bankrupt")
    plt.title("{} / t-test p-value: {:.4f}".format(feature, test.pvalue))
    plt.legend()
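Rather than eyeballing dozens of plots, the p-values can also be tabulated in one frame (a sketch reusing bankrupt_df, not_bankrupt_df, and cols from above; the 0.05 cutoff is an assumption, not part of the original analysis):
# Sketch: collect every feature's t-test p-value into one sorted table
pvals = []
for feature in cols:
    a = bankrupt_df[feature]
    b = not_bankrupt_df[feature].sample(n=len(a), random_state=42)
    pvals.append((feature, stats.ttest_ind(a, b).pvalue))
pval_df = pd.DataFrame(pvals, columns=['feature', 'p_value']).sort_values('p_value')
print((pval_df['p_value'] < 0.05).sum(), "features differ significantly at alpha = 0.05")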
Data type and quality
Out of the 6819 companies in the dataset:
First impressions:
# Drop constant columns (if any)
var_thres = VarianceThreshold().fit(df)
constant_columns = [column for column in df.columns if column not in df.columns[var_thres.get_support()]]
for feature in constant_columns:
    print(feature)
df.drop(constant_columns, axis=1, inplace=True)
# Standardize the features (zero mean, unit variance) for faster, more stable training
def data_scaling(frame):
    scaler = StandardScaler()
    frame.iloc[:, 1:] = scaler.fit_transform(frame.iloc[:, 1:])  # skip column 0, the target
    return frame
df = data_scaling(df)
# Split the whole dataset into training and test sets;
# the test split is held out as "real-world" data
X = df.drop(columns=["Bankrupt?"])
y = df["Bankrupt?"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
df_train = pd.concat([y_train, X_train], axis=1, join="inner")
df_test = pd.concat([y_test, X_test], axis = 1, join = "inner")
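Note that the scaler above was fit on the full dataset before splitting, so test-set statistics leak into training. A leakage-free variant (a sketch of the alternative, not what this notebook does) fits the scaler on the training split only:
# Sketch: learn scaling statistics from the training split, reuse them on the test split
leak_free_scaler = StandardScaler()
X_train_scaled = pd.DataFrame(leak_free_scaler.fit_transform(X_train),
                              columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(leak_free_scaler.transform(X_test),
                             columns=X_test.columns, index=X_test.index)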
# Sort columns from the less correlated to the most correlated
#df_train_corr = df_train.corr()
#df_train_corr = df_train_corr.reindex(df_train_corr["Bankrupt?"].abs().sort_values(ascending=True).index).T
#column_names = np.array(df_train_corr.columns)
#df_train = df_train.reindex(columns=column_names)
# Flag input features that are highly correlated with each other
def correlation(dataset, threshold):
    col_corr = set()  # names of columns flagged for removal
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # compare absolute coefficients
                col_corr.add(corr_matrix.columns[i])  # keep the first of the pair, flag the second
    return col_corr
corr_features = correlation(X_train, 0.7)
display(len(corr_features))
corr_features
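The 0.7 cutoff is a judgment call; a quick sketch (reusing the correlation helper above) shows how sensitive the flagged-feature count is to the threshold:
# Sketch: how the number of flagged features responds to the cutoff
for t in (0.6, 0.7, 0.8, 0.9):
    print(f"threshold {t}: {len(correlation(X_train, t))} features flagged")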
# Drop the highly correlated features:
X_train.drop(corr_features, axis=1, inplace=True)
X_test.drop(corr_features, axis=1, inplace=True)
df_train.drop(corr_features,axis=1, inplace=True)
df_train.shape
# Instantiate RFECV visualizer with a linear SVM classifier
visualizer_svc = RFECV(SVC(kernel='linear', C=1))
visualizer_svc.fit(X_train, y_train) # Fit the data to the visualizer
visualizer_svc.show() # Finalize and render the figure
# Display the selected features' names (the support_ mask indexes X_train's columns)
most_relevant_cols = X_train.columns[visualizer_svc.support_]
print("Most relevant features based on the linear SVM classifier are:")
print(most_relevant_cols)
# Instantiate RFECV visualizer with a RandomForest classifier
cv = StratifiedKFold(5)
visualizer_rf = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted')
visualizer_rf.fit(X_train, y_train)
visualizer_rf.show()
# Display the selected features' names
most_relevant_cols = X_train.columns[visualizer_rf.support_]
print("Most relevant features based on the RandomForest classifier are:")
print(most_relevant_cols)
# Instantiate RFECV visualizer with a LogisticRegression
# Set number of folds
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
# Minimum number of features to consider
min_features_to_select = 1
visualizer_lr = RFECV(LogisticRegression(max_iter=1000), cv=cv, scoring='f1_weighted',
min_features_to_select=min_features_to_select,n_jobs=1)
visualizer_lr.fit(X_train, y_train)
visualizer_lr.show()
# Display the selected features' names
most_relevant_cols = X_train.columns[visualizer_lr.support_]
print("Most relevant features based on Logistic Regression are:")
print(most_relevant_cols)
feature_list = ["Bankrupt?",
' ROA(C) before interest and depreciation before interest',
' Persistent EPS in the Last Four Seasons',
' ROA(B) before interest and depreciation after tax',
' Debt ratio %',
' Operating Gross Margin',
' Research and development expense rate',
' Continuous interest rate (after tax)',
' After-tax Net Profit Growth Rate',
' Realized Sales Gross Profit Growth Rate',
' Total Asset Growth Rate',
' Net Value Growth Rate',
' Total Asset Return Growth Rate Ratio',
' Current Ratio',
' Quick Ratio',
' Quick Assets/Total Assets',
' Revenue per person',
' Long-term fund suitability ratio (A)',
' Accounts Receivable Turnover',
' Net Worth Turnover Rate (times)',
' Current Liability to Assets',
' Interest Expense Ratio',
' Operating Profit Per Share (Yuan ¥)',
' Operating profit/Paid-in capital',
' Average Collection Days']
final_train = df.loc[:, df.columns.isin(feature_list)]
round(final_train.describe(),2)
# Get the score on correlation
fX = final_train.drop(columns=["Bankrupt?"])
fy = final_train["Bankrupt?"]
select_features = SelectKBest(score_func=f_classif, k=10).fit(fX, fy)
select_features_kbest = pd.DataFrame({'Features': list(fX.columns),'Scores': select_features.scores_})
select_features_kbest.sort_values(by='Scores', ascending = False)
# Remove the features whose SelectKBest scores fall below 20
low_score_features = [' Total Asset Growth Rate', ' Revenue per person',
                      ' After-tax Net Profit Growth Rate', ' Quick Ratio',
                      ' Research and development expense rate', ' Net Worth Turnover Rate (times)',
                      ' Long-term fund suitability ratio (A)', ' Total Asset Return Growth Rate Ratio',
                      ' Continuous interest rate (after tax)', ' Average Collection Days',
                      ' Accounts Receivable Turnover', ' Interest Expense Ratio',
                      ' Current Ratio', ' Realized Sales Gross Profit Growth Rate']
final_train.drop(low_score_features, axis=1, inplace=True)
final_train.shape
# Final correlation heatmap of the selected features
fig, ax = plt.subplots(figsize=(14,12))
sns.heatmap(final_train.corr(), vmin=-1, vmax=1, cmap=sns.diverging_palette(20, 220, as_cmap=True), annot=True)
# Distribution and box plot for each numeric variable
for column in final_train.columns:
    plt.figure(figsize=(16, 4))
    # distribution plot
    plt.subplot(1, 2, 1)
    sns.histplot(final_train[column], kde=True, stat="density")
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.title(f'{column} Distribution')
    # box plot, grouped by the target
    plt.subplot(1, 2, 2)
    sns.boxplot(x="Bankrupt?", y=column, data=final_train, showmeans=True)
    plt.xlabel("Bankrupt?")
    plt.ylabel(column)
    plt.title(f'{column} Box Plot')
    plt.show()
# Plot the class imbalance again
ax = final_train['Bankrupt?'].value_counts().plot(title='Bankrupt vs. Not Bankrupt', kind='barh', color='green')
ax.set_xlabel("Count")
ax.set_ylabel("Bankrupt?")
plt.show()
print(final_train['Bankrupt?'].value_counts())
final_trainX = final_train.drop(columns=["Bankrupt?"])
final_trainy = final_train["Bankrupt?"]
# oversample the minority class, using the SMOTE method
oversample = SMOTE()
new_train_X, new_train_y = oversample.fit_resample(final_trainX, final_trainy)
ax = new_train_y.value_counts().plot(title='Class balance after SMOTE', kind='barh', color='green')
ax.set_xlabel("Count")
ax.set_ylabel("Bankrupt?")
plt.show()
df_train_final = pd.concat([new_train_y, new_train_X], axis=1, join='inner')
print(df_train_final.shape)
df_train_final.head()
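Because SMOTE above runs before the train/validation split, synthetic neighbors of a point can land on both sides of that split. A common alternative (a sketch using imblearn's Pipeline, not what this notebook does) keeps oversampling inside the training folds of a cross-validation:
from imblearn.pipeline import Pipeline as ImbPipeline
# Sketch: SMOTE is refit on each training fold only; validation folds stay untouched
pipe = ImbPipeline([('smote', SMOTE(random_state=42)),
                    ('clf', LogisticRegression(max_iter=1000))])
scores = cross_val_score(pipe, final_trainX, final_trainy,
                         cv=StratifiedKFold(5), scoring='roc_auc')
print("CV ROC AUC: %.3f" % mean(scores))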
# Re-split the oversampled data into training and validation sets (for model evaluation)
train_x, val_x, train_y, val_y = train_test_split(df_train_final.drop('Bankrupt?',axis=1),
df_train_final['Bankrupt?'],
test_size=0.3, random_state = 42)
# show the shape of the train and validation data
train_x.shape, val_x.shape, train_y.shape, val_y.shape
# For comparison, the original (pre-SMOTE) training split is still imbalanced
ax = df_train['Bankrupt?'].value_counts().plot(title='Class balance before SMOTE', kind='barh', color='green')
ax.set_xlabel("Count")
ax.set_ylabel("Bankrupt?")
plt.show()
print(df_train['Bankrupt?'].value_counts())
# Apply the same feature selection to the test split
df_test = df_test.loc[:, df_test.columns.isin(feature_list)]
df_test.drop(low_score_features, axis=1, inplace=True)
test_x = df_test.iloc[:,1:]
test_y = df_test.iloc[:,0]
# Oversample the minority class in the test split with SMOTE
oversample = SMOTE()
test_x, test_y = oversample.fit_resample(test_x, test_y)
ax = test_y.value_counts().plot(title='Test-set class balance after SMOTE', kind='barh', color='green')
ax.set_xlabel("Count")
ax.set_ylabel("Bankrupt?")
plt.show()
train_y.shape
logistic_model = LogisticRegression(random_state = 42)
logistic_model.fit(train_x, train_y)
lg_base_probs = logistic_model.predict_proba(test_x)
lg_base_pred = logistic_model.predict(test_x)
lg_base_probs_prob = lg_base_probs[:,1]
print(confusion_matrix(test_y, lg_base_pred))
print(classification_report(test_y, lg_base_pred))
print(roc_auc_score(test_y, lg_base_probs_prob))
lg_y_train_pred = logistic_model.predict(train_x)
print(classification_report(train_y, lg_y_train_pred))
print(roc_auc_score(train_y, logistic_model.predict_proba(train_x)[:,1]))
rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(train_x, train_y)
rf_base_probs = rf_model.predict_proba(test_x)
rf_base_pred = rf_model.predict(test_x)
rf_base_probs_prob = rf_base_probs[:,1]
print(confusion_matrix(test_y, rf_base_pred))
print(classification_report(test_y, rf_base_pred))
print(roc_auc_score(test_y, rf_base_probs_prob))
rf_y_train_pred = rf_model.predict(train_x)
print(classification_report(train_y, rf_y_train_pred))
print(roc_auc_score(train_y, rf_model.predict_proba(train_x)[:,1]))
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': rf_model.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(5)
feature_importance.plot.bar(x='name', y='importance')
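Impurity-based importances can overstate high-cardinality or correlated features; permutation importance on held-out data is a useful cross-check (a sketch, assuming the fitted rf_model and the validation split from above):
from sklearn.inspection import permutation_importance
# Sketch: importance = drop in ROC AUC when a feature's values are shuffled
perm = permutation_importance(rf_model, val_x, val_y, n_repeats=10,
                              random_state=42, scoring='roc_auc')
perm_df = pd.DataFrame({'name': val_x.columns, 'importance': perm.importances_mean})
perm_df.sort_values('importance', ascending=False).head(5).plot.bar(x='name', y='importance')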
# early stopping
early_stop = EarlyStopping(monitor='val_auc',mode='max', verbose=1, patience=27,restore_best_weights=True)
# ANN
model = Sequential()
model.add(Dense(units=10,activation='relu'))
model.add(Dropout(0.10))
model.add(Dense(units=4,activation='relu'))
model.add(Dense(units=1,activation='sigmoid'))
# compile the ANN; track AUC so EarlyStopping can actually monitor val_auc
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
# Train ANN
model.fit(x=train_x,
y=train_y,
epochs=120,
validation_data=(val_x, val_y), verbose=1,
callbacks=[early_stop]
)
# training history to a DataFrame
history_df = pd.DataFrame(model.history.history)
# loss and accuracy plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
plt.style.use('seaborn')
ax1.plot(history_df['loss'], label='Training loss')
ax1.plot(history_df['val_loss'], label='Validation loss')
ax1.set_title('Training and Validation loss')
ax1.set_xlabel('epochs')
ax1.set_ylabel('Loss')
ax1.legend(loc="best")
ax2.plot(history_df['accuracy'], label='Training accuracy')
ax2.plot(history_df['val_accuracy'], label='Validation accuracy')
ax2.set_title('Training and Validation accuracy')
ax2.set_xlabel('epochs')
ax2.set_ylabel('Accuracy')
ax2.legend(loc="best")
y_pred_prob = model.predict(test_x).ravel()  # sigmoid outputs are already probabilities
y_pred = (y_pred_prob > 0.5)
test_x.shape
sns.heatmap(confusion_matrix(test_y,y_pred,normalize='true'), annot=True)
print(classification_report(test_y, y_pred))
print(roc_auc_score(test_y, y_pred_prob))
ann_train_prob = model.predict(train_x).ravel()
ann_y_train_pred = (ann_train_prob > 0.5)
print(classification_report(train_y, ann_y_train_pred))
print(roc_auc_score(train_y, ann_train_prob))
gb_model = GradientBoostingClassifier()
parameters = {
'n_estimators': [100, 200, 300, 400],
'learning_rate': [0.2,0.4,0.6],
'max_depth': [1,2]
}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
gb_grid = GridSearchCV(gb_model, parameters, cv=cv, scoring='roc_auc',refit=True, n_jobs=-1, verbose=5)
gb_grid.fit(train_x, train_y)
gb_model2 = gb_grid.best_estimator_
gb_model2.fit(train_x, train_y)
gb_model2
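The winning hyper-parameters and the mean cross-validated score are also available directly on the fitted search object:
print("Best params:", gb_grid.best_params_)
print("Best CV ROC AUC: %.3f" % gb_grid.best_score_)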
gb_base_probs = gb_model2.predict_proba(test_x)
gb_base_pred = gb_model2.predict(test_x)
gb_base_probs_prob = gb_base_probs[:,1]
print(confusion_matrix(test_y, gb_base_pred))
print(classification_report(test_y, gb_base_pred))
print(roc_auc_score(test_y, gb_base_probs_prob))
gb_y_train_pred = gb_model2.predict(train_x)
print(classification_report(train_y, gb_y_train_pred))
print(roc_auc_score(train_y, gb_model2.predict_proba(train_x)[:,1]))
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz
from IPython.display import Image
# Inspect one of the boosted trees (estimator index 1)
sub_tree = gb_model2.estimators_[1, 0]
dot_data = export_graphviz(
    sub_tree,
out_file=None, filled=True, rounded=True,
special_characters=True,
proportion=False, impurity=False, # enable them if you want
)
graph = graph_from_dot_data(dot_data)
Image(graph.create_png())
# get and summarize feature importances
importance = gb_model2.feature_importances_
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importances
plt.bar(range(len(importance)), importance)
plt.show()
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': gb_model2.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(5)
feature_importance.plot.bar(x='name', y='importance')
# create AdaBoost model
adb = AdaBoostClassifier()
param_grid = {'n_estimators':[100,200,300,400],
'learning_rate':[0.2,0.4,0.6,0.8,1,1.2],
'random_state': [0]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Create gridsearch object with various combinations of parameters
adb_Grid = GridSearchCV(adb, param_grid, cv = cv, scoring = 'roc_auc',refit = True, n_jobs=-1, verbose = 5)
adb_Grid.fit(train_x, train_y)
adb_model2 = adb_Grid.best_estimator_
adb_model2.fit(train_x, train_y)
adb_model2
adb_base_probs = adb_model2.predict_proba(test_x)
adb_base_pred = adb_model2.predict(test_x)
adb_base_probs_prob = adb_base_probs[:,1]
print(confusion_matrix(test_y, adb_base_pred))
print(classification_report(test_y, adb_base_pred))
print(roc_auc_score(test_y, adb_base_probs_prob))
adb_y_train_pred = adb_model2.predict(train_x)
print(classification_report(train_y, adb_y_train_pred))
print(roc_auc_score(train_y, adb_model2.predict_proba(train_x)[:,1]))
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': adb_model2.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(5)
feature_importance.plot.bar(x='name', y='importance')
xgb_model = xgb.XGBClassifier()  # avoid shadowing the xgb module
param_grid_2 = {'n_estimators':[int(x) for x in np.linspace(start = 100, stop = 1000, num = 19)],
'learning_rate':[x for x in np.linspace(start = 0.1, stop = 1.6, num = 16)],
'max_depth':[1,2],
'gamma':[x for x in np.linspace(start = 0, stop = 5, num = 21)]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Create randomized-search object over the parameter distributions
xgb_Grid = RandomizedSearchCV(xgb_model, param_grid_2, cv=cv, scoring='roc_auc', refit=True, n_jobs=-1, verbose=5)
xgb_Grid.fit(train_x, train_y)
xgb_model2 = xgb_Grid.best_estimator_
xgb_model2.fit(train_x, train_y)
xgb_model2
xgb_base_probs = xgb_model2.predict_proba(test_x)
xgb_base_pred = xgb_model2.predict(test_x)
xgb_base_probs_prob = xgb_base_probs[:,1]
print(confusion_matrix(test_y, xgb_base_pred))
print(classification_report(test_y, xgb_base_pred))
print(roc_auc_score(test_y, xgb_base_probs_prob))
xgb_y_train_pred = xgb_model2.predict(train_x)
print(classification_report(train_y, xgb_y_train_pred))
print(roc_auc_score(train_y, xgb_model2.predict_proba(train_x)[:,1]))
feature_importance = pd.DataFrame({'name': test_x.columns, 'importance': xgb_model2.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(5)
feature_importance.plot.bar(x='name', y='importance')
# Summary table of model performance
results = pd.DataFrame(
    [['Logistic Regression', 0.86, 0.938, 0.87, 0.94],
     ['Random Forest', 0.94, 0.989, 1.00, 1.00],
     ['ANN', 0.88, 0.952, 0.90, 0.961],
     ['Gradient Boost', 0.92, 0.977, 0.98, 0.997],
     ['Ada Boost', 0.87, 0.9426, 0.88, 0.951],
     ['XGBoost', 0.87, 0.943, 0.89, 0.952]],
    columns=['Name', 'Test_f1_score', 'Test_roc_auc_score',
             'Train_f1_score', 'Train_roc_auc_score'])
results
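To rank the models at a glance, the summary just built can be sorted by held-out ROC AUC:
results.sort_values('Test_roc_auc_score', ascending=False)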